# import necessary libraries
import pandas as pd
from pandas import DataFrame, Series
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import apyori as ap
from apyori import apriori #Apriori Algorithm
import mlxtend as ml
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from mlxtend.preprocessing import TransactionEncoder
import arules
# Data formatting: show every row/column and widen column text in output.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# BUG FIX: the bare 'max_colwidth' alias is deprecated (removed in pandas 2.x);
# use the fully-qualified option key.
pd.set_option('display.max_colwidth', 100)
# Load the employee-attrition dataset and take a first look at it.
df = pd.read_csv("employee_attrition.csv", sep=',')
df.head(3)
df.info()
df.describe().T

# Columns of dtype 'object' (string/categorical features).
col_obj = [c for c in df.columns.to_list() if str(df[c].dtype) == 'object']
df[col_obj].describe().T

# Frequency table for every categorical column.
for col in col_obj:
    print(df[col].value_counts())
    print()
# Correlation matrix over the numeric features, rendered as a heatmap.
numeric_for_corr = [
    'Age', 'DailyRate', 'DistanceFromHome', 'Education', 'EmployeeCount',
    'EmployeeNumber', 'EnvironmentSatisfaction', 'HourlyRate',
    'JobInvolvement', 'JobLevel', 'JobSatisfaction', 'MonthlyIncome',
    'MonthlyRate', 'NumCompaniesWorked', 'PercentSalaryHike',
    'PerformanceRating', 'RelationshipSatisfaction', 'StandardHours',
    'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
    'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole',
    'YearsSinceLastPromotion', 'YearsWithCurrManager',
]
corr = df[numeric_for_corr].corr()
sns.heatmap(corr, cmap="YlGnBu")
import plotly.express as px

# One pie chart per categorical column showing its value distribution.
for col in col_obj:
    pie_fig = px.pie(df, df[col], title=str(col))
    pie_fig.show()
# Strip stray whitespace from every string column, in place.
obj_frame = df.select_dtypes(['object'])
df[obj_frame.columns] = obj_frame.apply(lambda s: s.str.strip())

# Duplicate and missing-value checks.
if df.duplicated().sum() == 0:
    print('No duplicated rows')
df.isnull().sum()

# Drop any row containing at least one missing value.
df2 = df.dropna(axis=0, how='any')
print("Number of rows with missing values deleted:", df.shape[0] - df2.shape[0])
Delete the three constant features (zero variance) because they don't add value to the analysis.
# Inspect each zero-variance column, then delete it.
for const_col in ('EmployeeCount', 'StandardHours', 'Over18'):
    df2[const_col].value_counts()
    del df2[const_col]
print("Number of columns deleted:", df.shape[1] - df2.shape[1])
print("Number of columns of df2:", df2.shape[1])
Each of the following columns has one extremely large outlier value whose row needs to be deleted.
# Report the extreme maxima before removing the rows that hold them.
print('Maximum value of Total Working Years:', max(df2['TotalWorkingYears']))
print('Maximum value of Years With Manager:', max(df2['YearsWithCurrManager']))
# BUG FIX: the original printed the label "Years With Manager" here too,
# although the value shown is the DistanceFromHome maximum.
print('Maximum value of Distance From Home:', max(df2['DistanceFromHome']))

# Keep only rows that do not hold the maximum of any of the three columns.
df3 = df2.loc[
    (df2['TotalWorkingYears'] != max(df2['TotalWorkingYears']))
    & (df2["YearsWithCurrManager"] != max(df2["YearsWithCurrManager"]))
    & (df2['DistanceFromHome'] != max(df2["DistanceFromHome"]))]
# BUG FIX: the original message said "columns" but the count is of rows.
print('Number of rows with outliers deleted:', df2.shape[0] - df3.shape[0])
# Before/after boxplots for each column whose extreme value was removed:
# left panel uses df2 (with the outlier), right panel uses df3 (without it).
for outlier_col in ("TotalWorkingYears", "YearsWithCurrManager", "DistanceFromHome"):
    fig = plt.figure(figsize=(12, 4))  # create a new figure
    ax0 = fig.add_subplot(121)
    ax0 = sns.boxplot(x=df2[outlier_col])
    ax0.set_title("With an extreme value", fontsize=10)
    ax1 = fig.add_subplot(122)
    ax1 = sns.boxplot(x=df3[outlier_col])
    ax1.set_title("Without an extreme value", fontsize=10)
# Numeric columns remaining in the cleaned frame.
numeric_cols = df3.select_dtypes([np.number]).columns
print("Number of numeric columns", '\n', len(numeric_cols))
# BUG FIX: the original plotted df[numeric_cols]; df3 (the cleaned frame whose
# columns were just listed) is almost certainly what was meant.
df3[numeric_cols].hist(figsize=(15, 20))

# Ordinal survey-style columns stored as small integers.
col_list = ['Education', 'JobInvolvement', 'JobLevel', 'PerformanceRating',
            'RelationshipSatisfaction', 'WorkLifeBalance', 'StockOptionLevel',
            'JobSatisfaction', 'EnvironmentSatisfaction', 'TrainingTimesLastYear']
for col in col_list:
    print(df3[col].value_counts())

# BUG FIX: the original called df3.astype({col: 'int32'}) without assigning the
# result, so nothing was converted, and printed type(col) — the type of the
# column *name* string. Convert each column to 'object' (matching the stated
# intent of treating them as categoricals) and print the resulting dtype.
for col in col_list:
    df3[col] = df3[col].astype('object')
    print(col, ':', df3[col].dtype)
print("Number of numeric columns transformed to objects", '\n', len(col_list))
Use pd.qcut (quantile-based, equal-frequency binning) for the following skewed columns:
# Quantile-based (equal-frequency) binning into three ordered groups.
df3['MonthlyIncome_grp'] = pd.qcut(
    df3['MonthlyIncome'], 3,
    labels=['low_mon_inc', 'med_mon_inc', 'high_mon_inc'])
df3['YearsWithCurrManager_grp'] = pd.qcut(
    df3['YearsWithCurrManager'], 3,
    labels=['rec_manager', 'some_time_manager', 'long_time_manager'])
Use pd.cut (equal-width binning) for the remaining numeric columns:
# Equal-width binning (pd.cut) for the remaining numeric columns.
# Each source column gets a companion "<name>_grp" categorical; the number of
# bins equals the number of labels supplied.
# TYPO FIX: 'few_wrok_yrs' -> 'few_work_yrs' (label is data, nothing else
# in this file references the misspelled string).
cut_specs = {
    'NumCompaniesWorked': ['few_comps', 'mid_comps', 'many_comps'],
    'Age': ['young_age', 'middle_age', 'senior_age', 'old_age'],
    'DailyRate': ['low_daily_rate', 'med_daily_rate', 'high_daily_rate'],
    'DistanceFromHome': ['walking_dist', 'short_dist', 'mid_dist', 'long_dist'],
    'EmployeeNumber': ['few_employ', 'many_employ', 'a_lot_employ'],
    'HourlyRate': ['low_hr_rate', 'med_hr_rate', 'high_hr_rate'],
    'PercentSalaryHike': ['low_per_hike', 'med_per_hike', 'high_per_hike'],
    'TotalWorkingYears': ['few_work_yrs', 'many_work_yrs', 'a_lot_work_yrs'],
    'YearsAtCompany': ['few_yrs_comp', 'many_yrs_comp', 'a_lot_yrs_comp'],
    'YearsInCurrentRole': ['few_yrs_role', 'many_yrs_role', 'a_lot_yrs_role'],
    'YearsSinceLastPromotion': ['rec_since_promo', 'some_time_since_promo',
                                'long_time_since_promo'],
    'MonthlyRate': ['low_mon_rate', 'med_mon_rate', 'high_mon_rate'],
}
for src_col, labels in cut_specs.items():
    df3[src_col + '_grp'] = pd.cut(df3[src_col], len(labels), labels=labels)
# Drop the raw numeric columns that were replaced by *_grp categoricals
# (everything numeric except the ordinal columns kept in col_list).
# BUG FIX: a stray no-op `col` expression left over from the notebook was removed.
col_to_drop = list(set(numeric_cols) - set(col_list))
print(col_to_drop)
# delete columns
for col in col_to_drop:
    del df3[col]
print(df3.shape)
df3.head()
# Prefix every cell with "<column>=" so each value becomes a unique
# transaction item (e.g. "Attrition=Yes" vs "OverTime=Yes").
prefixes = pd.DataFrame({c: str(c) + '=' for c in df3}, index=df3.index)
df4 = prefixes + df3.astype(str)
df4.head()
# Melt to one row per (column, value) pair, then count item frequencies
# (sorted ascending, so head() = rarest items, tail() = most frequent).
melted_data = pd.melt(df4)
frequency = (melted_data.groupby(by=['value'])['value']
             .count()
             .sort_values(ascending=True))
freq_itemset = pd.DataFrame({'item': frequency.index,
                             'frequency': frequency.values})

# Visualize the 10 least frequent items.
g = sns.barplot(data=freq_itemset.head(10), x='item', y='frequency',
                color="salmon", saturation=.5)
g.set_xticklabels(g.get_xticklabels(), rotation=90)
plt.show()

# Visualize the 10 most frequent items.
g = sns.barplot(data=freq_itemset.tail(10), x='item', y='frequency',
                color="salmon", saturation=.5)
g.set_xticklabels(g.get_xticklabels(), rotation=90)
plt.show()
# Turn each row of df4 into a list of item strings (one transaction per row).
records = [[str(cell) for cell in row] for row in df4.values]
records[0]
# One-hot encode the transactions into a boolean item matrix.
te = TransactionEncoder()
onehot = te.fit(records).transform(records)
# For our convenience, turn the encoded array into a pandas DataFrame.
df5 = pd.DataFrame(onehot, columns=te.columns_)
df5.head(3)
# Persist the encoded matrix for reuse.
df5.to_csv('df5.csv', header=True, encoding='utf-8', index=False)
# Baseline model: mlxtend apriori with its default min_support (0.5),
# then association rules ranked by lift.
frequent_itemsets = apriori(df5, use_colnames=True)
frequent_itemsets.sort_values(by='support', ascending=False)
rules = association_rules(frequent_itemsets, metric="lift")
rules.head()
scatter_ax = sns.scatterplot(data=rules, x='support', y='confidence', hue='lift')
plt.show()
# Tuned model: raise min_support to 0.6 and require lift >= 1.
frequent_itemsets2 = apriori(df5, min_support=0.6, use_colnames=True)
frequent_itemsets2.sort_values(by='support', ascending=False)
rules2 = association_rules(frequent_itemsets2, metric="lift", min_threshold=1)
# BUG FIX: the original filtered rules2 with boolean masks built from `rules`
# (the baseline frame), selecting rows based on the wrong rule set.
rules2[(rules2['lift'] > 1) & (rules2['confidence'] > 0.5)].head(10)
# Side-by-side comparison of the baseline and tuned rule sets.
fig = plt.figure(figsize=(12, 4))  # create a new figure
left = fig.add_subplot(121)
left = sns.scatterplot(data=rules, x='support', y='confidence', hue='lift')
left.set_title("Baseline Model", fontsize=8)
right = fig.add_subplot(122)
right = sns.scatterplot(data=rules2, x='support', y='confidence', hue='lift')
right.set_title("Parameter Tuned", fontsize=8)

# Class balance of the target; supports computed from the counts shown above
# (185 Yes / 991 No — hard-coded from the value_counts output).
df3['Attrition'].value_counts()
print('Support of Attrition=Yes:', 185 / (991 + 185))
print('Support of Attrition=No:', 991 / (991 + 185))
def SupervisedApriori(data, consequent, min_supp, min_conf, min_lift):
    """Mine association rules whose consequent is exactly one of the given items.

    Parameters
    ----------
    data : pandas.DataFrame
        One-hot encoded transaction matrix (as produced by TransactionEncoder).
    consequent : list of str
        Item names; only rules concluding in exactly one of these are kept.
    min_supp : float
        Minimum support passed to apriori.
    min_conf : float
        Minimum confidence threshold for association_rules.
    min_lift : float
        Rules with lift <= min_lift are discarded.

    Returns
    -------
    pandas.DataFrame
        The filtered rules, re-indexed from 0.
    """
    frequent_itemsets = apriori(data, min_support=min_supp, use_colnames=True)
    rules = association_rules(frequent_itemsets, metric="confidence",
                              min_threshold=min_conf)
    rules = rules[rules['lift'] > min_lift]
    # BUG FIX: DataFrame.append was deprecated and removed in pandas 2.0;
    # collect the per-consequent slices and concatenate once instead.
    # 'consequents' holds frozensets, so compare against a one-item frozenset.
    pieces = [rules[rules['consequents'] == frozenset({item})]
              for item in consequent]
    if not pieces:
        return pd.DataFrame()
    return pd.concat(pieces, ignore_index=True)
# Rules predicting Attrition=No, strongest lift first.
att_no = SupervisedApriori(df5, consequent=['Attrition=No'],
                           min_supp=0.5, min_conf=0.5, min_lift=1)
att_no = att_no.sort_values(by='lift', ascending=False)
att_no.head()

# Remove redundant rules: drop rule i when its antecedent is a strict subset
# of some other rule's antecedent.
# BUG FIXES vs the original:
#  - take an explicit copy instead of a slice (avoids chained-assignment issues);
#  - derive loop bounds from the frame length instead of hard-coding 20;
#  - compare every ordered pair (the original never used j == 0);
#  - drop by this frame's own labels instead of indexing through the unrelated
#    raw `df` frame.
att_no_2 = att_no.copy()
redundant_labels = set()
n_rules = len(att_no_2)
for i in range(n_rules):
    ante_i = set(att_no_2['antecedents'].iloc[i])
    for j in range(n_rules):
        if i == j:
            continue
        ante_j = set(att_no_2['antecedents'].iloc[j])
        if ante_i.issubset(ante_j) and ante_i != ante_j:
            redundant_labels.add(att_no_2.index[i])
            break
print(sorted(redundant_labels))
att_no_2 = att_no_2.drop(index=list(redundant_labels))
att_no_2.head(5)
# Rules predicting Attrition=Yes (much rarer class, so looser thresholds).
att_yes = SupervisedApriori(df5, consequent=['Attrition=Yes'],
                            min_supp=0.1, min_conf=0.0, min_lift=0.0)
att_yes = att_yes.sort_values(by='lift', ascending=False)
att_yes.head()

# Same redundancy filter as for Attrition=No: drop a rule whose antecedent is
# a strict subset of another rule's antecedent.
# BUG FIXES vs the original: explicit copy, bounds from len() instead of the
# hard-coded 29, all ordered pairs considered (j == 0 was skipped), and drop
# by this frame's own labels rather than via the unrelated raw `df` index.
att_yes_2 = att_yes.copy()
redundant_labels2 = set()
n_rules2 = len(att_yes_2)
for i in range(n_rules2):
    ante_i = set(att_yes_2['antecedents'].iloc[i])
    for j in range(n_rules2):
        if i == j:
            continue
        ante_j = set(att_yes_2['antecedents'].iloc[j])
        if ante_i.issubset(ante_j) and ante_i != ante_j:
            redundant_labels2.add(att_yes_2.index[i])
            break
print(sorted(redundant_labels2))
att_yes_2 = att_yes_2.drop(index=list(redundant_labels2))
att_yes_2.head(5)
# Print the full (untruncated) antecedents of two selected rules.
# NOTE(review): labels 27 and 28 are hard-coded; they assume those rows
# survived the redundancy filter above — confirm after rerunning.
for label, rule_no in ((27, 2), (28, 3)):
    print(f'Rule {rule_no}:', att_yes_2['antecedents'][label], '\n')